Import des librairies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.image import imread
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

%matplotlib inline
pd.set_option('display.max_columns', 100)

import missingno as msno
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly

import cv2
import os
from os import path

from PIL import Image

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics 

from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

import time

import warnings
warnings.filterwarnings('ignore')

Import dataset

In [2]:
# Load the five pre-computed CSV inputs; the first column of each file is used as the index.
# `df` is the ARI table from ari_im.csv that is extended and plotted at the end of the notebook.
data, df_tsne_vgg, df_sift, df, df_use = (
    pd.read_csv(csv_file, index_col=0)
    for csv_file in (
        "Flipkart/data_cleaned.csv",
        "Flipkart/im_features_vgg.csv",
        "Flipkart/im_features_sift.csv",
        "Flipkart/ari_im.csv",
        "Flipkart/df_use.csv",
    )
)

Regroupement Text et Images SIFT

In [3]:
# Place the text features (df_use) and the SIFT image features side by side.
# pd.concat aligns on the index, so both frames must share the same row labels.
text_features_final = df_use
image_features_sift = df_sift
all_features_sift = pd.concat(
    objs=[text_features_final, image_features_sift],
    axis="columns",
)
In [4]:
# Report the dimensions of each feature block and of their concatenation.
for caption, frame in (
    ("text_features shape :", text_features_final),
    ("image_features shape :", image_features_sift),
    ("all_features shape :", all_features_sift),
):
    print(caption, frame.shape)
text_features shape : (1050, 512)
image_features shape : (1050, 832)
all_features shape : (1050, 1344)
In [5]:
def pca(vector, n_components=0.99):
    """Fit a PCA on `vector` and return the projected data.

    Parameters
    ----------
    vector : array-like or DataFrame
        Feature matrix handed to ``PCA.fit_transform``.
    n_components : int or float, default 0.99
        Forwarded unchanged to ``sklearn.decomposition.PCA``. The default
        keeps the value this notebook previously hard-coded.

    Returns
    -------
    The array returned by ``PCA.fit_transform(vector)``.
    """
    # Named `reducer` so the local variable no longer shadows this function's own name.
    reducer = PCA(n_components=n_components)
    return reducer.fit_transform(vector)

PCA

In [6]:
# Reduce the concatenated text + SIFT feature matrix with the `pca` helper.
pca_all_sift = pca(all_features_sift)

T-SNE

In [7]:
# Single t-SNE estimator reused for both the SIFT and the VGG feature sets.
# The fixed random_state keeps the 2-D embedding reproducible between runs.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=80,
    n_iter=5000,
    learning_rate=200,
    random_state=42,
)
In [8]:
# Embed the PCA-reduced text + SIFT features in two dimensions.
X_tsne_siftt = tsne.fit_transform(pca_all_sift)

# Wrap the embedding in a DataFrame, the form expected by plot_kmeans_tsne.
df_tsne_sift = pd.DataFrame(data=X_tsne_siftt, columns=['tsne1', 'tsne2'])
print(df_tsne_sift.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.016s...
[t-SNE] Computed neighbors for 1050 samples in 1.214s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.401698
[t-SNE] KL divergence after 250 iterations with early exaggeration: 58.000778
[t-SNE] KL divergence after 2350 iterations: 0.470344
(1050, 2)
In [9]:
# Adjusted Rand Index of every clustering run, in call order.
# Re-running this cell resets the list; it is read later to build `df_ari`.
list_ari = []


def plot_kmeans_tsne(reduction, title, filename, colname, n_clusters=7):
    """Cluster a 2-D embedding with K-Means, score it and plot it.

    Side effects (all relied upon by later cells):
      * writes the cluster labels to ``data[f'cluster {colname}']``;
      * appends the Adjusted Rand Index to the module-level ``list_ari``;
      * writes two HTML files under ``plots/`` and displays both figures.

    Parameters
    ----------
    reduction : pandas.DataFrame
        Embedding to cluster; its first two columns are used as the x and y
        plot coordinates. Rows are assumed to be in the same order as `data`.
    title : str
        Suffix appended to both figure titles.
    filename : str
        Base name (without extension) of the HTML files written to ``plots/``.
    colname : str
        Suffix of the ``data`` column that receives the cluster labels.
    n_clusters : int, default 7
        Number of K-Means clusters (7 was previously hard-coded).

    Returns
    -------
    tuple
        ``(fig.show(), fig1.show())`` -- kept for backward compatibility.
    """
    kmeans_tsne = KMeans(n_clusters=n_clusters, n_init=50, max_iter=200,
                         init='k-means++', random_state=42).fit(reduction)
    labels_tsne = kmeans_tsne.labels_

    # Store the predicted labels next to the ground truth so later cells can reuse them.
    data[f'cluster {colname}'] = labels_tsne
    categories_predict = data[f'cluster {colname}']
    categories_true = data['product_category_1']

    adjusted_rand = metrics.adjusted_rand_score(categories_true, categories_predict)
    list_ari.append(adjusted_rand)
    print("\033[1mAdjusted Rand Index: %0.3f\033[0m" % adjusted_rand)

    # Read the coordinates straight from `reduction`. The previous pd.concat with a
    # fresh label frame was only used for these two columns and could misalign rows
    # if `reduction` carried a non-default index.
    x_coords = reduction.iloc[:, 0]
    y_coords = reduction.iloc[:, 1]

    fig = px.scatter(data, x=x_coords, y=y_coords, color=categories_true,
                     title=f"Représentation selon les vraies classes {title}")

    # Cluster ids are nominal: cast them to str so Plotly uses a discrete colour per
    # cluster instead of a continuous colour scale over the integer labels.
    fig1 = px.scatter(data, x=x_coords, y=y_coords, color=categories_predict.astype(str),
                      title=f"Représentation selon les clusters {title}")

    # Make sure the output directory exists before writing the HTML exports.
    os.makedirs('plots', exist_ok=True)
    plotly.offline.plot(fig, filename=f'plots/{filename}.html')
    plotly.offline.plot(fig1, filename=f'plots/{filename}_cluster.html')

    return fig.show(), fig1.show()
In [10]:
# Cluster and plot the text + SIFT embedding.
# The trailing `;` stops the notebook from echoing the function's return value.
plot_kmeans_tsne(
    reduction=df_tsne_sift,
    title="Clusters Sift lemmatize",
    filename="Sift_lemmatize",
    colname="sift_lemmatize",
);
Adjusted Rand Index: 0.469
Out[10]:
(None, None)
In [11]:
# Draw each product thumbnail at its t-SNE position (text + SIFT features).
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title('Visualisation Prédiction avec entrainement Texte et Image', fontweight='bold')

# NOTE(review): data.index is used as positional row numbers into the embedding;
# this assumes the index holds 0-based positions -- confirm against data_cleaned.csv.
tsne_x = X_tsne_siftt[data.index, 0]
tsne_y = X_tsne_siftt[data.index, 1]
ax.scatter(tsne_x, tsne_y)

# The loop variable is `img_path`, not `path`: this notebook imports `path` from `os`,
# and a loop variable of that name would overwrite the import for every later cell.
for x0, y0, img_path in zip(tsne_x, tsne_y, 'Flipkart/Images/' + data.image):
    thumbnail = OffsetImage(plt.imread(img_path), zoom=0.025)
    ax.add_artist(AnnotationBbox(thumbnail, (x0, y0), frameon=False))

ax.set_xlabel('TSNE 1')
ax.set_ylabel('TSNE 2')
plt.show()
In [12]:
# Break down the true product categories found inside each K-Means cluster (text + SIFT).
cluster_col = 'cluster sift_lemmatize'
# Cluster labels ordered by decreasing cluster size (value_counts order).
cluster_ids = data[cluster_col].value_counts().index
index_tot = [data[data[cluster_col] == cluster_id].index for cluster_id in cluster_ids]

# plt.subplot requires integer grid dimensions; the previous `len(index_tot)/3`
# produced a float. Ceiling division guarantees enough cells for every cluster.
n_rows = 4
n_cols = -(-len(index_tot) // n_rows)

plt.figure(figsize=(20, 20))
for position, (cluster_id, rows) in enumerate(zip(cluster_ids, index_tot), start=1):
    categories = data.loc[rows, 'product_category_1']
    plt.subplot(n_rows, n_cols, position)
    # Bars sorted from the most to the least frequent category in this cluster.
    sns.countplot(y=categories,
                  order=categories.value_counts().index,
                  palette='Blues_r')
    # Title with the actual cluster label rather than the loop position, so the
    # panels match the cluster ids stored in `data`.
    plt.title(f"Cluster {cluster_id}", fontsize=20)

Regroupement Text et Images VGG

In [13]:
# Place the text features and the VGG-derived image features side by side.
# NOTE(review): the frame loaded from im_features_vgg.csv has only 7 columns (see the
# shapes printed below); confirm that every column is a feature and not a label.
image_features_VGG = df_tsne_vgg
all_features_VGG = pd.concat(
    objs=[text_features_final, image_features_VGG],
    axis="columns",
)
In [14]:
# Report the dimensions of each feature block and of their concatenation.
for caption, frame in (
    ("text_features shape :", text_features_final),
    ("image_features shape :", image_features_VGG),
    ("all_features shape :", all_features_VGG),
):
    print(caption, frame.shape)
text_features shape : (1050, 512)
image_features shape : (1050, 7)
all_features shape : (1050, 519)
In [15]:
# Reduce the concatenated text + VGG feature matrix with the `pca` helper.
pca_all_VGG = pca(all_features_VGG)
In [16]:
# Embed the PCA-reduced text + VGG features in two dimensions (the estimator is refit here).
X_tsne_VGG = tsne.fit_transform(pca_all_VGG)

# Wrap the embedding in a DataFrame, the form expected by plot_kmeans_tsne.
df_tsne_VGG = pd.DataFrame(data=X_tsne_VGG, columns=['tsne1', 'tsne2'])
print(df_tsne_VGG.shape)
[t-SNE] Computing 241 nearest neighbors...
[t-SNE] Indexed 1050 samples in 0.007s...
[t-SNE] Computed neighbors for 1050 samples in 0.430s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1050
[t-SNE] Computed conditional probabilities for sample 1050 / 1050
[t-SNE] Mean sigma: 0.489718
[t-SNE] KL divergence after 250 iterations with early exaggeration: 40.195015
[t-SNE] KL divergence after 2450 iterations: 0.195026
(1050, 2)
In [17]:
# Cluster and plot the text + VGG embedding.
# The trailing `;` stops the notebook from echoing the function's return value.
plot_kmeans_tsne(
    reduction=df_tsne_VGG,
    title="Clusters VGG lemmatize",
    filename="vgg_lemmatize",
    colname="vgg_lemmatize",
);
Adjusted Rand Index: 0.471
Out[17]:
(None, None)
In [18]:
# Break down the true product categories found inside each K-Means cluster (text + VGG).
cluster_col = 'cluster vgg_lemmatize'
# Cluster labels ordered by decreasing cluster size (value_counts order).
cluster_ids = data[cluster_col].value_counts().index
index_tot = [data[data[cluster_col] == cluster_id].index for cluster_id in cluster_ids]

# plt.subplot requires integer grid dimensions; the previous `len(index_tot)/3`
# produced a float. Ceiling division guarantees enough cells for every cluster.
n_rows = 4
n_cols = -(-len(index_tot) // n_rows)

plt.figure(figsize=(20, 20))
for position, (cluster_id, rows) in enumerate(zip(cluster_ids, index_tot), start=1):
    categories = data.loc[rows, 'product_category_1']
    plt.subplot(n_rows, n_cols, position)
    # Bars sorted from the most to the least frequent category in this cluster.
    sns.countplot(y=categories,
                  order=categories.value_counts().index,
                  palette='Blues_r')
    # Title with the actual cluster label rather than the loop position, so the
    # panels match the cluster ids shown in the confusion matrix.
    plt.title(f"Cluster {cluster_id}", fontsize=20)

Comparaison des différentes méthodes de classification

In [19]:
# One-row table of the ARI scores collected by plot_kmeans_tsne, in call order.
# list_ari must hold exactly two values (SIFT run, then VGG run) to match the columns.
df_ari = pd.DataFrame(
    data=[list_ari],
    index=['ARI_SCORE'],
    columns=['sift_lem', 'vgg_lem'],
)
In [20]:
# Bar chart of the two ARI scores computed in this notebook (rounded to 2 decimals).
ari_axes = df_ari.T.round(2).plot.bar(figsize=(10, 6))
ari_axes.set_xlabel("Model")
ari_axes.set_ylabel("ARI Score")
Out[20]:
Text(0, 0.5, 'ARI Score')
In [21]:
# Append this notebook's ARI columns to the ARI table loaded from ari_im.csv.
# Columns with the same names are dropped first so the cell can be re-run: a plain
# `df.join(df_ari)` raises on a second execution because the columns already overlap.
df = df.drop(columns=df_ari.columns, errors="ignore").join(df_ari, how="inner")
In [22]:
# Bar chart comparing every ARI score in the combined table (rounded to 2 decimals).
all_ari_axes = df.T.round(2).plot.bar(figsize=(10, 6))
all_ari_axes.set_xlabel("Model")
all_ari_axes.set_ylabel("ARI Score")
Out[22]:
Text(0, 0.5, 'ARI Score')

Classification

Matrice de confusion

In [23]:
# Contingency table: true product category (rows) against the K-Means cluster
# assigned in the text + VGG run (columns).
ct = pd.crosstab(index=data['product_category_1'],
                 columns=data['cluster vgg_lemmatize'])

# Annotated heatmap of the contingency table.
plt.figure(figsize=(10, 7))
ax = sns.heatmap(ct, annot=True, fmt='g')
ax.set(xlabel='Clusters', ylabel='Categories')

# Display workaround: add 0.5 to the first y-limit and subtract 0.5 from the second.
# NOTE(review): presumably targets the matplotlib release that clipped the first and
# last heatmap rows -- confirm it is still needed with the installed version.
first_limit, second_limit = ax.get_ylim()
ax.set_ylim(first_limit + 0.5, second_limit - 0.5);

Les produits de certaines catégories sont correctement classés. D'autres catégories n'ont pas pu être associées clairement à un cluster, comme la catégorie "Home Furnishing" ou la catégorie "Kitchen & Dining".

Il est impossible d'associer avec certitude une catégorie à un numéro de cluster.

In [24]:
# Draw each product thumbnail at its t-SNE position (text + VGG features).
fig, ax = plt.subplots(figsize=(15, 15))
ax.set_title('Visualisation Prédiction avec entrainement Texte et Image', fontweight='bold')

# NOTE(review): data.index is used as positional row numbers into the embedding;
# this assumes the index holds 0-based positions -- confirm against data_cleaned.csv.
tsne_x = X_tsne_VGG[data.index, 0]
tsne_y = X_tsne_VGG[data.index, 1]
ax.scatter(tsne_x, tsne_y)

# The loop variable is `img_path`, not `path`: this notebook imports `path` from `os`,
# and a loop variable of that name would overwrite the import for every later cell.
for x0, y0, img_path in zip(tsne_x, tsne_y, 'Flipkart/Images/' + data.image):
    thumbnail = OffsetImage(plt.imread(img_path), zoom=0.025)
    ax.add_artist(AnnotationBbox(thumbnail, (x0, y0), frameon=False))

ax.set_xlabel('TSNE 1')
ax.set_ylabel('TSNE 2')
plt.show()

La classification à partir des données textuelles (descriptions lemmatisées) et des images traitées avec VGG donne des résultats satisfaisants.